import os
import pandas as pd

# Build a CSV index of the generated images found under ``base_dir``.
#
# Layout expected by the parsing below:
#   <base_dir>/.../{GENDER}_{PROFESSION}_{INDEX}/{GENDER}_{RACE}_{CATEGORY}.jpg
# Files or folders that do not follow this naming scheme are skipped.

# Define the base directory
base_dir = 'data/'


def parse_image_entry(root, file):
    """Return the metadata row for one image file, or None if it does not match.

    Args:
        root: Directory containing the file; its last path component must
            split on '_' into at least three parts (gender, profession, index).
        file: File name; must end with '.jpg' and its stem must split on '_'
            into exactly three parts (gender, race, category).

    Returns:
        ``[file_path, folder_gender, profession, index, file_gender, race,
        category]`` as a list of strings, or ``None`` when either the folder
        name or the file name does not follow the expected pattern.
    """
    suffix = '.jpg'
    if not file.endswith(suffix):
        return None

    # Strip only the trailing extension; str.replace would also remove any
    # '.jpg' occurring inside the category itself.
    file_parts = file[:-len(suffix)].split('_')
    if len(file_parts) != 3:
        return None

    # Explicit length check instead of relying on IndexError for control flow.
    # Extra trailing parts in the folder name are ignored.
    folder_parts = os.path.basename(root).split('_')
    if len(folder_parts) < 3:
        return None

    gender, profession, index = folder_parts[:3]
    file_gender, race, category = file_parts
    file_path = os.path.join(root, file)
    return [file_path, gender, profession, index, file_gender, race, category]


def collect_image_rows(directory):
    """Walk ``directory`` recursively and return metadata rows for matching images.

    Directories and files are visited in sorted order so that the resulting
    row order (and therefore the CSV) is reproducible across runs and
    filesystems.
    """
    rows = []
    for root, dirs, files in os.walk(directory):
        # Sorting in place makes os.walk descend into subdirectories in
        # sorted order (top-down traversal honours modifications to `dirs`).
        dirs.sort()
        for file in sorted(files):
            row = parse_image_entry(root, file)
            if row is not None:
                rows.append(row)
    return rows


# Collect file paths and metadata
data = collect_image_rows(base_dir)

# Create a DataFrame from the collected data
df = pd.DataFrame(
    data,
    columns=['file_path', 'folder_gender', 'profession', 'index', 'file_gender', 'race', 'category'],
)

# Persist the index for further use
df.to_csv("generated_image_list.csv", index=False)